# Scatter of the realised data-sharing rate against the share of grants that
# mandate sharing, coloured by the continuous pubs-vs-data incentive weight.
ggplot(df_clean,
       aes(data_grant_share, data_sharing_perc, colour = pubs.vs.data)) +
  geom_point(size = 1.5) +
  scale_colour_viridis_c(option = "C")
# Why does the right side not go up higher? Is this the base funding that
# stays stronger for those with not many grants?
# Same scatter, but with a discrete colour scale and hover labels carrying
# each group's publication and grant counts; rendered interactively below.
p <- df_clean %>%
  mutate(stats = paste("pubs:", n_publications, "grants:", total_grants)) %>%
  ggplot() +
  aes(data_grant_share, data_sharing_perc,
      colour = factor(pubs.vs.data), label = stats) +
  geom_point(alpha = .5) +
  scale_colour_viridis_d(option = "C")
p
plotly::ggplotly(p)
# One panel per incentive setting, points made translucent so the dense
# regions stand out.
df_clean %>%
  mutate(stats = paste("pubs:", n_publications, "grants:", total_grants)) %>%
  ggplot() +
  aes(data_grant_share, data_sharing_perc, label = stats) +
  geom_point(alpha = .1) +
  facet_wrap(vars(factor(pubs.vs.data))) +
  labs(x = "% of grants that mandate data sharing",
       y = "% of publications with shared data")
# The same facets as contour density estimates instead of raw points.
df_clean %>%
  mutate(stats = paste("pubs:", n_publications, "grants:", total_grants)) %>%
  ggplot() +
  aes(data_grant_share, data_sharing_perc, label = stats) +
  geom_density_2d() +
  facet_wrap(vars(factor(pubs.vs.data))) +
  labs(x = "% of grants that mandate data sharing",
       y = "% of publications with shared data")
Rewarding researchers based on shared datasets simply adds more randomness to the system.
# Total productivity against the share of data-mandating grants, one panel
# per incentive setting.
ggplot(df_clean, aes(data_grant_share, n_publications)) +
  geom_point() +
  facet_wrap(vars(pubs.vs.data)) +
  labs(title = "Influence of incentive settings",
       x = "% of grants that mandate data sharing",
       y = "# of total publications")
# Baseline additive model on the raw scale: publications as a function of
# total grants held and the share of grants mandating data sharing.
m1 <- lm(n_publications ~ total_grants + data_grant_share, data = df_clean)
summary(m1)
##
## Call:
## lm(formula = n_publications ~ total_grants + data_grant_share,
## data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -130.89 -13.09 0.28 13.33 158.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 156.164550 0.963725 162.04 <2e-16 ***
## total_grants 5.903346 0.001857 3179.15 <2e-16 ***
## data_grant_share -28.204328 1.882182 -14.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.35 on 11997 degrees of freedom
## Multiple R-squared: 0.9988, Adjusted R-squared: 0.9988
## F-statistic: 5.056e+06 on 2 and 11997 DF, p-value: < 2.2e-16
# have to use rescale directly, since arm::standardize does not work with the
# targets workflow
# Refit m1 with predictors rescaled so coefficient magnitudes become
# comparable across predictors (same point estimates' t values as above).
m1 <- lm(n_publications ~
arm::rescale(total_grants) +
arm::rescale(data_grant_share),
data = df_clean)
summary(m1)
##
## Call:
## lm(formula = n_publications ~ arm::rescale(total_grants) + arm::rescale(data_grant_share),
## data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -130.89 -13.09 0.28 13.33 158.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 614.4616 0.2131 2882.88 <2e-16 ***
## arm::rescale(total_grants) 1355.8297 0.4265 3179.15 <2e-16 ***
## arm::rescale(data_grant_share) -6.3907 0.4265 -14.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.35 on 11997 degrees of freedom
## Multiple R-squared: 0.9988, Adjusted R-squared: 0.9988
## F-statistic: 5.056e+06 on 2 and 11997 DF, p-value: < 2.2e-16
# Check for curvature: add a quadratic term for total_grants.
m2 <- lm(n_publications ~ total_grants + data_grant_share +
I(total_grants^2),
data = df_clean)
summary(m2)
##
## Call:
## lm(formula = n_publications ~ total_grants + data_grant_share +
## I(total_grants^2), data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163.622 -13.171 -0.035 13.007 159.018
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.579e+02 9.702e-01 162.80 <2e-16 ***
## total_grants 5.853e+00 4.674e-03 1252.20 <2e-16 ***
## data_grant_share -2.777e+01 1.872e+00 -14.84 <2e-16 ***
## I(total_grants^2) 1.044e-04 8.874e-06 11.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.22 on 11996 degrees of freedom
## Multiple R-squared: 0.9988, Adjusted R-squared: 0.9988
## F-statistic: 3.41e+06 on 3 and 11996 DF, p-value: < 2.2e-16
# m2 fits better, but has too high VIF
# NOTE(review): the VIFs near 6 for total_grants and I(total_grants^2)
# reflect collinearity between the linear and quadratic grant terms, which
# is why the quadratic model is discarded below.
car::vif(m2)
## total_grants data_grant_share I(total_grants^2)
## 6.413794 1.001205 6.410746
# so we stick with model 1
# however, there might be an interaction between the share of grants and the
# total number of grants
# Interaction model on rescaled predictors: does the penalty from mandated
# data sharing depend on how many grants a group holds?
m3 <- lm(n_publications ~
arm::rescale(total_grants) *
arm::rescale(data_grant_share),
data = df_clean)
summary(m3)
##
## Call:
## lm(formula = n_publications ~ arm::rescale(total_grants) * arm::rescale(data_grant_share),
## data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -122.390 -12.998 0.184 13.147 155.050
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 614.9651 0.2115
## arm::rescale(total_grants) 1357.9269 0.4335
## arm::rescale(data_grant_share) -23.8306 0.9899
## arm::rescale(total_grants):arm::rescale(data_grant_share) -70.2500 3.6110
## t value Pr(>|t|)
## (Intercept) 2908.25 <2e-16 ***
## arm::rescale(total_grants) 3132.20 <2e-16 ***
## arm::rescale(data_grant_share) -24.07 <2e-16 ***
## arm::rescale(total_grants):arm::rescale(data_grant_share) -19.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.99 on 11996 degrees of freedom
## Multiple R-squared: 0.9989, Adjusted R-squared: 0.9989
## F-statistic: 3.477e+06 on 3 and 11996 DF, p-value: < 2.2e-16
# Compare the additive (m1) and interaction (m3) models; lower AIC wins.
AIC(m3, m1)
## df AIC
## m3 5 109301.5
## m1 4 109672.3
# m3 is clearly better: AIC lower by ~370 despite one extra parameter
# Nest the data by incentive setting so the interaction model can be fit
# separately within each pubs.vs.data level.
df_nested <- df_clean %>%
group_by(pubs.vs.data) %>%
nest()
# Fit the grants-by-sharing interaction model for one incentive setting.
#
# df: data frame with columns n_publications, total_grants and
#     data_grant_share.
# Returns the fitted lm object.
modeling_fun <- function(df) {
  lm(n_publications ~ total_grants * data_grant_share, data = df)
}
# Fit the model per group and keep fit statistics (glance) and coefficient
# estimates with confidence intervals (tidy) as list columns alongside the
# data.
df_nested <- df_nested %>%
mutate(model = map(data, modeling_fun),
glance = map(model, broom::glance),
tidy = map(model, broom::tidy, conf.int = TRUE))
df_nested
## # A tibble: 6 x 5
## # Groups: pubs.vs.data [6]
## pubs.vs.data data model glance tidy
## <dbl> <list> <list> <list> <list>
## 1 0.4 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 2 0.2 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 3 0.6 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 4 0.8 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 5 1 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 6 0 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
# Unnest the per-group fit statistics for a side-by-side comparison.
df_nested %>%
unnest(glance)
## # A tibble: 6 x 16
## # Groups: pubs.vs.data [6]
## pubs.vs.data data model r.squared adj.r.squared sigma statistic p.value df
## <dbl> <lis> <lis> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.4 <tib~ <lm> 0.999 0.999 22.5 601588. 0 3
## 2 0.2 <tib~ <lm> 0.999 0.999 22.5 569099. 0 3
## 3 0.6 <tib~ <lm> 0.999 0.999 22.9 656012. 0 3
## 4 0.8 <tib~ <lm> 0.999 0.999 23.1 647773. 0 3
## 5 1 <tib~ <lm> 0.999 0.999 22.6 690781. 0 3
## 6 0 <tib~ <lm> 0.998 0.998 23.4 368577. 0 3
## # ... with 7 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
## # deviance <dbl>, df.residual <int>, nobs <int>, tidy <list>
# plot it
# Prediction grid: grant counts 1-50 crossed with five levels of the
# mandated-sharing share (0 to 1 in steps of 0.25).
dummy_data <- tibble(
  total_grants = rep(1:50, times = 5),
  data_grant_share = rep(seq(0, 1, by = 0.25), each = 50)
)
# Predict from each group's model over the common grid (broom::augment adds
# a .fitted column) and plot fitted publication counts, one line per sharing
# share and one panel per incentive setting.
df_nested %>%
mutate(new_data = list(dummy_data)) %>%
mutate(prediction = map2(model, new_data, ~broom::augment(.x, newdata = .y))) %>%
unnest(prediction) %>%
ggplot(aes(total_grants, .fitted, colour = as.factor(data_grant_share))) +
geom_line() +
geom_point() +
facet_wrap(vars(pubs.vs.data)) +
labs(y = "Predicted number of publications",
colour = "% of grants mandating data sharing",
title = "Productivity depending on data sharing incentives") +
theme(legend.position = "top")
When data sharing is incentivised this way, groups that are funded exclusively by funders mandating open data fare worse than the others.
A possible explanation for the lower inequality in the system: some actors are disadvantaged, and therefore randomness accounts for a larger share of the outcomes.
Important to note: early on there seems to be an advantage, but it quickly fades and turns into a disadvantage.
An interesting next step would be to give agents strategies (e.g. always share data if the last n grants mandated sharing, for a longer-term effect, versus simply complying with each grant directly), and then determine what share of funders mandating data sharing is needed before behaviour changes.
It might also be worth including agents that always share data, and seeing how they are affected.
-> See the exploration script for now.